1

1.1 Outlier analysis I

Consider the data set USJudgeRatings in R

# loading the data set 
data("USJudgeRatings")
# store in a shorter variable name USJR
USJR <-  USJudgeRatings

(a) Check the description of the data.

# help command ? to retrieve metadata
?USJudgeRatings
# correlation matrix
pairs(USJR, col=6, pch=8)

# using the pipe-operator %>% to establish a pipeline of functions
USJR %>% summary()
##       CONT             INTG            DMNR            DILG      
##  Min.   : 5.700   Min.   :5.900   Min.   :4.300   Min.   :5.100  
##  1st Qu.: 6.850   1st Qu.:7.550   1st Qu.:6.900   1st Qu.:7.150  
##  Median : 7.300   Median :8.100   Median :7.700   Median :7.800  
##  Mean   : 7.437   Mean   :8.021   Mean   :7.516   Mean   :7.693  
##  3rd Qu.: 7.900   3rd Qu.:8.550   3rd Qu.:8.350   3rd Qu.:8.450  
##  Max.   :10.600   Max.   :9.200   Max.   :9.000   Max.   :9.000  
##       CFMG            DECI            PREP            FAMI      
##  Min.   :5.400   Min.   :5.700   Min.   :4.800   Min.   :5.100  
##  1st Qu.:7.000   1st Qu.:7.100   1st Qu.:6.900   1st Qu.:6.950  
##  Median :7.600   Median :7.700   Median :7.700   Median :7.600  
##  Mean   :7.479   Mean   :7.565   Mean   :7.467   Mean   :7.488  
##  3rd Qu.:8.050   3rd Qu.:8.150   3rd Qu.:8.200   3rd Qu.:8.250  
##  Max.   :8.700   Max.   :8.800   Max.   :9.100   Max.   :9.100  
##       ORAL            WRIT            PHYS            RTEN      
##  Min.   :4.700   Min.   :4.900   Min.   :4.700   Min.   :4.800  
##  1st Qu.:6.850   1st Qu.:6.900   1st Qu.:7.700   1st Qu.:7.150  
##  Median :7.500   Median :7.600   Median :8.100   Median :7.800  
##  Mean   :7.293   Mean   :7.384   Mean   :7.935   Mean   :7.602  
##  3rd Qu.:8.000   3rd Qu.:8.050   3rd Qu.:8.500   3rd Qu.:8.250  
##  Max.   :8.900   Max.   :9.000   Max.   :9.100   Max.   :9.200
USJR %>% glimpse()
## Rows: 43
## Columns: 12
## $ CONT <dbl> 5.7, 6.8, 7.2, 6.8, 7.3, 6.2, 10.6, 7.0, 7.3, 8.2, 7.0, 6.5, 6.7,…
## $ INTG <dbl> 7.9, 8.9, 8.1, 8.8, 6.4, 8.8, 9.0, 5.9, 8.9, 7.9, 8.0, 8.0, 8.6, …
## $ DMNR <dbl> 7.7, 8.8, 7.8, 8.5, 4.3, 8.7, 8.9, 4.9, 8.9, 6.7, 7.6, 7.6, 8.2, …
## $ DILG <dbl> 7.3, 8.5, 7.8, 8.8, 6.5, 8.5, 8.7, 5.1, 8.7, 8.1, 7.4, 7.2, 6.8, …
## $ CFMG <dbl> 7.1, 7.8, 7.5, 8.3, 6.0, 7.9, 8.5, 5.4, 8.6, 7.9, 7.3, 7.0, 6.9, …
## $ DECI <dbl> 7.4, 8.1, 7.6, 8.5, 6.2, 8.0, 8.5, 5.9, 8.5, 8.0, 7.5, 7.1, 6.6, …
## $ PREP <dbl> 7.1, 8.0, 7.5, 8.7, 5.7, 8.1, 8.5, 4.8, 8.4, 7.9, 7.1, 6.9, 7.1, …
## $ FAMI <dbl> 7.1, 8.0, 7.5, 8.7, 5.7, 8.0, 8.5, 5.1, 8.4, 8.1, 7.2, 7.0, 7.3, …
## $ ORAL <dbl> 7.1, 7.8, 7.3, 8.4, 5.1, 8.0, 8.6, 4.7, 8.4, 7.7, 7.1, 7.0, 7.2, …
## $ WRIT <dbl> 7.0, 7.9, 7.4, 8.5, 5.3, 8.0, 8.4, 4.9, 8.5, 7.8, 7.2, 7.1, 7.2, …
## $ PHYS <dbl> 8.3, 8.5, 7.9, 8.8, 5.5, 8.6, 9.1, 6.8, 8.8, 8.5, 8.4, 6.9, 8.1, …
## $ RTEN <dbl> 7.8, 8.7, 7.8, 8.7, 4.8, 8.6, 9.0, 5.0, 8.8, 7.9, 7.7, 7.2, 7.7, …
USJR %>% colnames()
##  [1] "CONT" "INTG" "DMNR" "DILG" "CFMG" "DECI" "PREP" "FAMI" "ORAL" "WRIT"
## [11] "PHYS" "RTEN"
USJR 
##                 CONT INTG DMNR DILG CFMG DECI PREP FAMI ORAL WRIT PHYS RTEN
## AARONSON,L.H.    5.7  7.9  7.7  7.3  7.1  7.4  7.1  7.1  7.1  7.0  8.3  7.8
## ALEXANDER,J.M.   6.8  8.9  8.8  8.5  7.8  8.1  8.0  8.0  7.8  7.9  8.5  8.7
## ARMENTANO,A.J.   7.2  8.1  7.8  7.8  7.5  7.6  7.5  7.5  7.3  7.4  7.9  7.8
## BERDON,R.I.      6.8  8.8  8.5  8.8  8.3  8.5  8.7  8.7  8.4  8.5  8.8  8.7
## BRACKEN,J.J.     7.3  6.4  4.3  6.5  6.0  6.2  5.7  5.7  5.1  5.3  5.5  4.8
## BURNS,E.B.       6.2  8.8  8.7  8.5  7.9  8.0  8.1  8.0  8.0  8.0  8.6  8.6
## CALLAHAN,R.J.   10.6  9.0  8.9  8.7  8.5  8.5  8.5  8.5  8.6  8.4  9.1  9.0
## COHEN,S.S.       7.0  5.9  4.9  5.1  5.4  5.9  4.8  5.1  4.7  4.9  6.8  5.0
## DALY,J.J.        7.3  8.9  8.9  8.7  8.6  8.5  8.4  8.4  8.4  8.5  8.8  8.8
## DANNEHY,J.F.     8.2  7.9  6.7  8.1  7.9  8.0  7.9  8.1  7.7  7.8  8.5  7.9
## DEAN,H.H.        7.0  8.0  7.6  7.4  7.3  7.5  7.1  7.2  7.1  7.2  8.4  7.7
## DEVITA,H.J.      6.5  8.0  7.6  7.2  7.0  7.1  6.9  7.0  7.0  7.1  6.9  7.2
## DRISCOLL,P.J.    6.7  8.6  8.2  6.8  6.9  6.6  7.1  7.3  7.2  7.2  8.1  7.7
## GRILLO,A.E.      7.0  7.5  6.4  6.8  6.5  7.0  6.6  6.8  6.3  6.6  6.2  6.5
## HADDEN,W.L.JR.   6.5  8.1  8.0  8.0  7.9  8.0  7.9  7.8  7.8  7.8  8.4  8.0
## HAMILL,E.C.      7.3  8.0  7.4  7.7  7.3  7.3  7.3  7.2  7.1  7.2  8.0  7.6
## HEALEY.A.H.      8.0  7.6  6.6  7.2  6.5  6.5  6.8  6.7  6.4  6.5  6.9  6.7
## HULL,T.C.        7.7  7.7  6.7  7.5  7.4  7.5  7.1  7.3  7.1  7.3  8.1  7.4
## LEVINE,I.        8.3  8.2  7.4  7.8  7.7  7.7  7.7  7.8  7.5  7.6  8.0  8.0
## LEVISTER,R.L.    9.6  6.9  5.7  6.6  6.9  6.6  6.2  6.0  5.8  5.8  7.2  6.0
## MARTIN,L.F.      7.1  8.2  7.7  7.1  6.6  6.6  6.7  6.7  6.8  6.8  7.5  7.3
## MCGRATH,J.F.     7.6  7.3  6.9  6.8  6.7  6.8  6.4  6.3  6.3  6.3  7.4  6.6
## MIGNONE,A.F.     6.6  7.4  6.2  6.2  5.4  5.7  5.8  5.9  5.2  5.8  4.7  5.2
## MISSAL,H.M.      6.2  8.3  8.1  7.7  7.4  7.3  7.3  7.3  7.2  7.3  7.8  7.6
## MULVEY,H.M.      7.5  8.7  8.5  8.6  8.5  8.4  8.5  8.5  8.4  8.4  8.7  8.7
## NARUK,H.J.       7.8  8.9  8.7  8.9  8.7  8.8  8.9  9.0  8.8  8.9  9.0  9.0
## O'BRIEN,F.J.     7.1  8.5  8.3  8.0  7.9  7.9  7.8  7.8  7.8  7.7  8.3  8.2
## O'SULLIVAN,T.J.  7.5  9.0  8.9  8.7  8.4  8.5  8.4  8.3  8.3  8.3  8.8  8.7
## PASKEY,L.        7.5  8.1  7.7  8.2  8.0  8.1  8.2  8.4  8.0  8.1  8.4  8.1
## RUBINOW,J.E.     7.1  9.2  9.0  9.0  8.4  8.6  9.1  9.1  8.9  9.0  8.9  9.2
## SADEN.G.A.       6.6  7.4  6.9  8.4  8.0  7.9  8.2  8.4  7.7  7.9  8.4  7.5
## SATANIELLO,A.G.  8.4  8.0  7.9  7.9  7.8  7.8  7.6  7.4  7.4  7.4  8.1  7.9
## SHEA,D.M.        6.9  8.5  7.8  8.5  8.1  8.2  8.4  8.5  8.1  8.3  8.7  8.3
## SHEA,J.F.JR.     7.3  8.9  8.8  8.7  8.4  8.5  8.5  8.5  8.4  8.4  8.8  8.8
## SIDOR,W.J.       7.7  6.2  5.1  5.6  5.6  5.9  5.6  5.6  5.3  5.5  6.3  5.3
## SPEZIALE,J.A.    8.5  8.3  8.1  8.3  8.4  8.2  8.2  8.1  7.9  8.0  8.0  8.2
## SPONZO,M.J.      6.9  8.3  8.0  8.1  7.9  7.9  7.9  7.7  7.6  7.7  8.1  8.0
## STAPLETON,J.F.   6.5  8.2  7.7  7.8  7.6  7.7  7.7  7.7  7.5  7.6  8.5  7.7
## TESTO,R.J.       8.3  7.3  7.0  6.8  7.0  7.1  6.7  6.7  6.7  6.7  8.0  7.0
## TIERNEY,W.L.JR.  8.3  8.2  7.8  8.3  8.4  8.3  7.7  7.6  7.5  7.7  8.1  7.9
## WALL,R.A.        9.0  7.0  5.9  7.0  7.0  7.2  6.9  6.9  6.5  6.6  7.6  6.6
## WRIGHT,D.B.      7.1  8.4  8.4  7.7  7.5  7.7  7.8  8.2  8.0  8.1  8.3  8.1
## ZARRILLI,K.J.    8.6  7.4  7.0  7.5  7.5  7.7  7.4  7.2  6.9  7.0  7.8  7.1

Wee see that the lawyers’ names are used to lable the rows. We can extract these and use them later for analysis. Will we spot underperformers?

lawyers <- rownames(USJR)
lawyers
##  [1] "AARONSON,L.H."   "ALEXANDER,J.M."  "ARMENTANO,A.J."  "BERDON,R.I."    
##  [5] "BRACKEN,J.J."    "BURNS,E.B."      "CALLAHAN,R.J."   "COHEN,S.S."     
##  [9] "DALY,J.J."       "DANNEHY,J.F."    "DEAN,H.H."       "DEVITA,H.J."    
## [13] "DRISCOLL,P.J."   "GRILLO,A.E."     "HADDEN,W.L.JR."  "HAMILL,E.C."    
## [17] "HEALEY.A.H."     "HULL,T.C."       "LEVINE,I."       "LEVISTER,R.L."  
## [21] "MARTIN,L.F."     "MCGRATH,J.F."    "MIGNONE,A.F."    "MISSAL,H.M."    
## [25] "MULVEY,H.M."     "NARUK,H.J."      "O'BRIEN,F.J."    "O'SULLIVAN,T.J."
## [29] "PASKEY,L."       "RUBINOW,J.E."    "SADEN.G.A."      "SATANIELLO,A.G."
## [33] "SHEA,D.M."       "SHEA,J.F.JR."    "SIDOR,W.J."      "SPEZIALE,J.A."  
## [37] "SPONZO,M.J."     "STAPLETON,J.F."  "TESTO,R.J."      "TIERNEY,W.L.JR."
## [41] "WALL,R.A."       "WRIGHT,D.B."     "ZARRILLI,K.J."

(b) For all variables: try different instruments for outlier analysis as discussed in

lecture and identify potential outliers. - Boxplots

 # i = 1, 2, 3, ..., number of colums
for (i in 1:length(colnames(USJR))) {
  # index for matrices or data frames-> [rows, cols]
  # title for each boxplot
  boxplot(USJR[, i], main= as.character(colnames(USJR)[i]),col=12, alpha=0.5)
}

  • z-score \[ z = \frac{x - \bar{x}}{\sigma} = \frac{x-\mu}{\sigma} \]
for (i in 1:ncol(USJR)) {
  if (i==1) {
    x <- USJR[,i]
    x_bar <- mean(x)    # mean(USJR[, i])
    sigma <- sd(x)
    z <- (x-x_bar)/sigma
    # create data frame from zscore obtained
    z_scores <- data.frame(z)
  }
  else {
    x <- USJR[,i]
    x_bar <- mean(x)  
    sigma <- sd(x)
    z <- (x-x_bar)/sigma
    z_scores <- z_scores %>% cbind(z)
  }  
}
# renaming the column names by the corresponding variable
colnames(z_scores) <- colnames(USJR)
# adding back the lawyers' names to label the rows
rownames(z_scores) <- lawyers
#z_scores %>% View()

count the number of outlier ratings for each lawyer if \(-2.7<z<2.7\)

outlier_df <- ifelse(abs(z_scores) >= 2.8, 1, 0) %>% as.data.frame()
outlier_df
##                 CONT INTG DMNR DILG CFMG DECI PREP FAMI ORAL WRIT PHYS RTEN
## AARONSON,L.H.      0    0    0    0    0    0    0    0    0    0    0    0
## ALEXANDER,J.M.     0    0    0    0    0    0    0    0    0    0    0    0
## ARMENTANO,A.J.     0    0    0    0    0    0    0    0    0    0    0    0
## BERDON,R.I.        0    0    0    0    0    0    0    0    0    0    0    0
## BRACKEN,J.J.       0    0    1    0    0    0    0    0    0    0    0    0
## BURNS,E.B.         0    0    0    0    0    0    0    0    0    0    0    0
## CALLAHAN,R.J.      1    0    0    0    0    0    0    0    0    0    0    0
## COHEN,S.S.         0    0    0    1    0    0    0    0    0    0    0    0
## DALY,J.J.          0    0    0    0    0    0    0    0    0    0    0    0
## DANNEHY,J.F.       0    0    0    0    0    0    0    0    0    0    0    0
## DEAN,H.H.          0    0    0    0    0    0    0    0    0    0    0    0
## DEVITA,H.J.        0    0    0    0    0    0    0    0    0    0    0    0
## DRISCOLL,P.J.      0    0    0    0    0    0    0    0    0    0    0    0
## GRILLO,A.E.        0    0    0    0    0    0    0    0    0    0    0    0
## HADDEN,W.L.JR.     0    0    0    0    0    0    0    0    0    0    0    0
## HAMILL,E.C.        0    0    0    0    0    0    0    0    0    0    0    0
## HEALEY.A.H.        0    0    0    0    0    0    0    0    0    0    0    0
## HULL,T.C.          0    0    0    0    0    0    0    0    0    0    0    0
## LEVINE,I.          0    0    0    0    0    0    0    0    0    0    0    0
## LEVISTER,R.L.      0    0    0    0    0    0    0    0    0    0    0    0
## MARTIN,L.F.        0    0    0    0    0    0    0    0    0    0    0    0
## MCGRATH,J.F.       0    0    0    0    0    0    0    0    0    0    0    0
## MIGNONE,A.F.       0    0    0    0    0    0    0    0    0    0    1    0
## MISSAL,H.M.        0    0    0    0    0    0    0    0    0    0    0    0
## MULVEY,H.M.        0    0    0    0    0    0    0    0    0    0    0    0
## NARUK,H.J.         0    0    0    0    0    0    0    0    0    0    0    0
## O'BRIEN,F.J.       0    0    0    0    0    0    0    0    0    0    0    0
## O'SULLIVAN,T.J.    0    0    0    0    0    0    0    0    0    0    0    0
## PASKEY,L.          0    0    0    0    0    0    0    0    0    0    0    0
## RUBINOW,J.E.       0    0    0    0    0    0    0    0    0    0    0    0
## SADEN.G.A.         0    0    0    0    0    0    0    0    0    0    0    0
## SATANIELLO,A.G.    0    0    0    0    0    0    0    0    0    0    0    0
## SHEA,D.M.          0    0    0    0    0    0    0    0    0    0    0    0
## SHEA,J.F.JR.       0    0    0    0    0    0    0    0    0    0    0    0
## SIDOR,W.J.         0    0    0    0    0    0    0    0    0    0    0    0
## SPEZIALE,J.A.      0    0    0    0    0    0    0    0    0    0    0    0
## SPONZO,M.J.        0    0    0    0    0    0    0    0    0    0    0    0
## STAPLETON,J.F.     0    0    0    0    0    0    0    0    0    0    0    0
## TESTO,R.J.         0    0    0    0    0    0    0    0    0    0    0    0
## TIERNEY,W.L.JR.    0    0    0    0    0    0    0    0    0    0    0    0
## WALL,R.A.          0    0    0    0    0    0    0    0    0    0    0    0
## WRIGHT,D.B.        0    0    0    0    0    0    0    0    0    0    0    0
## ZARRILLI,K.J.      0    0    0    0    0    0    0    0    0    0    0    0
#rsums <- outlier_df %>% rowSums() %>% as.vector()
#outlier_df <- outlier_df %>% cbind(rsums, lawyers) %>% as.data.frame()

outlier_df <- outlier_df %>% mutate(outlier_df,
                      rsums = (CONT + INTG + DMNR + DILG + CFMG + DECI + PREP + 
                                 FAMI + ORAL + WRIT +PHYS + RTEN))
outlier_df
##                 CONT INTG DMNR DILG CFMG DECI PREP FAMI ORAL WRIT PHYS RTEN
## AARONSON,L.H.      0    0    0    0    0    0    0    0    0    0    0    0
## ALEXANDER,J.M.     0    0    0    0    0    0    0    0    0    0    0    0
## ARMENTANO,A.J.     0    0    0    0    0    0    0    0    0    0    0    0
## BERDON,R.I.        0    0    0    0    0    0    0    0    0    0    0    0
## BRACKEN,J.J.       0    0    1    0    0    0    0    0    0    0    0    0
## BURNS,E.B.         0    0    0    0    0    0    0    0    0    0    0    0
## CALLAHAN,R.J.      1    0    0    0    0    0    0    0    0    0    0    0
## COHEN,S.S.         0    0    0    1    0    0    0    0    0    0    0    0
## DALY,J.J.          0    0    0    0    0    0    0    0    0    0    0    0
## DANNEHY,J.F.       0    0    0    0    0    0    0    0    0    0    0    0
## DEAN,H.H.          0    0    0    0    0    0    0    0    0    0    0    0
## DEVITA,H.J.        0    0    0    0    0    0    0    0    0    0    0    0
## DRISCOLL,P.J.      0    0    0    0    0    0    0    0    0    0    0    0
## GRILLO,A.E.        0    0    0    0    0    0    0    0    0    0    0    0
## HADDEN,W.L.JR.     0    0    0    0    0    0    0    0    0    0    0    0
## HAMILL,E.C.        0    0    0    0    0    0    0    0    0    0    0    0
## HEALEY.A.H.        0    0    0    0    0    0    0    0    0    0    0    0
## HULL,T.C.          0    0    0    0    0    0    0    0    0    0    0    0
## LEVINE,I.          0    0    0    0    0    0    0    0    0    0    0    0
## LEVISTER,R.L.      0    0    0    0    0    0    0    0    0    0    0    0
## MARTIN,L.F.        0    0    0    0    0    0    0    0    0    0    0    0
## MCGRATH,J.F.       0    0    0    0    0    0    0    0    0    0    0    0
## MIGNONE,A.F.       0    0    0    0    0    0    0    0    0    0    1    0
## MISSAL,H.M.        0    0    0    0    0    0    0    0    0    0    0    0
## MULVEY,H.M.        0    0    0    0    0    0    0    0    0    0    0    0
## NARUK,H.J.         0    0    0    0    0    0    0    0    0    0    0    0
## O'BRIEN,F.J.       0    0    0    0    0    0    0    0    0    0    0    0
## O'SULLIVAN,T.J.    0    0    0    0    0    0    0    0    0    0    0    0
## PASKEY,L.          0    0    0    0    0    0    0    0    0    0    0    0
## RUBINOW,J.E.       0    0    0    0    0    0    0    0    0    0    0    0
## SADEN.G.A.         0    0    0    0    0    0    0    0    0    0    0    0
## SATANIELLO,A.G.    0    0    0    0    0    0    0    0    0    0    0    0
## SHEA,D.M.          0    0    0    0    0    0    0    0    0    0    0    0
## SHEA,J.F.JR.       0    0    0    0    0    0    0    0    0    0    0    0
## SIDOR,W.J.         0    0    0    0    0    0    0    0    0    0    0    0
## SPEZIALE,J.A.      0    0    0    0    0    0    0    0    0    0    0    0
## SPONZO,M.J.        0    0    0    0    0    0    0    0    0    0    0    0
## STAPLETON,J.F.     0    0    0    0    0    0    0    0    0    0    0    0
## TESTO,R.J.         0    0    0    0    0    0    0    0    0    0    0    0
## TIERNEY,W.L.JR.    0    0    0    0    0    0    0    0    0    0    0    0
## WALL,R.A.          0    0    0    0    0    0    0    0    0    0    0    0
## WRIGHT,D.B.        0    0    0    0    0    0    0    0    0    0    0    0
## ZARRILLI,K.J.      0    0    0    0    0    0    0    0    0    0    0    0
##                 rsums
## AARONSON,L.H.       0
## ALEXANDER,J.M.      0
## ARMENTANO,A.J.      0
## BERDON,R.I.         0
## BRACKEN,J.J.        1
## BURNS,E.B.          0
## CALLAHAN,R.J.       1
## COHEN,S.S.          1
## DALY,J.J.           0
## DANNEHY,J.F.        0
## DEAN,H.H.           0
## DEVITA,H.J.         0
## DRISCOLL,P.J.       0
## GRILLO,A.E.         0
## HADDEN,W.L.JR.      0
## HAMILL,E.C.         0
## HEALEY.A.H.         0
## HULL,T.C.           0
## LEVINE,I.           0
## LEVISTER,R.L.       0
## MARTIN,L.F.         0
## MCGRATH,J.F.        0
## MIGNONE,A.F.        1
## MISSAL,H.M.         0
## MULVEY,H.M.         0
## NARUK,H.J.          0
## O'BRIEN,F.J.        0
## O'SULLIVAN,T.J.     0
## PASKEY,L.           0
## RUBINOW,J.E.        0
## SADEN.G.A.          0
## SATANIELLO,A.G.     0
## SHEA,D.M.           0
## SHEA,J.F.JR.        0
## SIDOR,W.J.          0
## SPEZIALE,J.A.       0
## SPONZO,M.J.         0
## STAPLETON,J.F.      0
## TESTO,R.J.          0
## TIERNEY,W.L.JR.     0
## WALL,R.A.           0
## WRIGHT,D.B.         0
## ZARRILLI,K.J.       0
# base R barplot
barplot(rowSums(outlier_df), cex.axis=2, las=2, col=2)

# ggplot
ggplot(data=outlier_df, aes(x=lawyers)) +
  geom_bar(aes(weight=rsums), fill='purple', alpha=0.6) +
  theme(axis.text.x = element_text(angle = 90, hjust=1))

outlier_df %>% summarize(lawyers, rsums) %>% arrange(-rsums) %>% head(10)
##           lawyers rsums
## 1    BRACKEN,J.J.     1
## 2   CALLAHAN,R.J.     1
## 3      COHEN,S.S.     1
## 4    MIGNONE,A.F.     1
## 5   AARONSON,L.H.     0
## 6  ALEXANDER,J.M.     0
## 7  ARMENTANO,A.J.     0
## 8     BERDON,R.I.     0
## 9      BURNS,E.B.     0
## 10      DALY,J.J.     0
outliers <- boxplot(USJudgeRatings, plot = FALSE)$out

# Print the indices of the outliers
cat("Indices of potential outliers:", which(apply(USJudgeRatings, 2, function(x) x %in% outliers)), "\n")
## Indices of potential outliers: 6 7 20 24 51 78 91 94 109 121 127 137 152 180 195 220 223 250 266 278 309 323 324 349 352 358 366 367 379 392 395 409 422 435 444 453 465 478 481 496 508

outlier clustering with dbscan

library(dbscan)
## 
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
dist_mtx <- dist(USJR)
# perform DBSCAN clustering
cluster <- dbscan(dist_mtx, eps = 1, minPts = 5)

# print the outlier indices
outlier_indices <- which(cluster$cluster == 0)
print(outlier_indices)
##  [1]  1  2  5  6  7  8 10 11 12 13 14 16 17 18 19 20 21 22 23 24 26 29 30 31 32
## [26] 33 35 36 39 40 41 42 43
  • skewness & kurtosis
# initiating empty vectors to store values
s <- c()
k <- c()
for (i in 1:length(colnames(USJR))) {
  skew <- skewness(USJR[, i]); s <- append(s, skew);
  kurt <- kurtosis(USJR[, i]); k <- append(k, kurt);
}

#USJR <- USJR %>% rbind(s, k)
USJR
##                 CONT INTG DMNR DILG CFMG DECI PREP FAMI ORAL WRIT PHYS RTEN
## AARONSON,L.H.    5.7  7.9  7.7  7.3  7.1  7.4  7.1  7.1  7.1  7.0  8.3  7.8
## ALEXANDER,J.M.   6.8  8.9  8.8  8.5  7.8  8.1  8.0  8.0  7.8  7.9  8.5  8.7
## ARMENTANO,A.J.   7.2  8.1  7.8  7.8  7.5  7.6  7.5  7.5  7.3  7.4  7.9  7.8
## BERDON,R.I.      6.8  8.8  8.5  8.8  8.3  8.5  8.7  8.7  8.4  8.5  8.8  8.7
## BRACKEN,J.J.     7.3  6.4  4.3  6.5  6.0  6.2  5.7  5.7  5.1  5.3  5.5  4.8
## BURNS,E.B.       6.2  8.8  8.7  8.5  7.9  8.0  8.1  8.0  8.0  8.0  8.6  8.6
## CALLAHAN,R.J.   10.6  9.0  8.9  8.7  8.5  8.5  8.5  8.5  8.6  8.4  9.1  9.0
## COHEN,S.S.       7.0  5.9  4.9  5.1  5.4  5.9  4.8  5.1  4.7  4.9  6.8  5.0
## DALY,J.J.        7.3  8.9  8.9  8.7  8.6  8.5  8.4  8.4  8.4  8.5  8.8  8.8
## DANNEHY,J.F.     8.2  7.9  6.7  8.1  7.9  8.0  7.9  8.1  7.7  7.8  8.5  7.9
## DEAN,H.H.        7.0  8.0  7.6  7.4  7.3  7.5  7.1  7.2  7.1  7.2  8.4  7.7
## DEVITA,H.J.      6.5  8.0  7.6  7.2  7.0  7.1  6.9  7.0  7.0  7.1  6.9  7.2
## DRISCOLL,P.J.    6.7  8.6  8.2  6.8  6.9  6.6  7.1  7.3  7.2  7.2  8.1  7.7
## GRILLO,A.E.      7.0  7.5  6.4  6.8  6.5  7.0  6.6  6.8  6.3  6.6  6.2  6.5
## HADDEN,W.L.JR.   6.5  8.1  8.0  8.0  7.9  8.0  7.9  7.8  7.8  7.8  8.4  8.0
## HAMILL,E.C.      7.3  8.0  7.4  7.7  7.3  7.3  7.3  7.2  7.1  7.2  8.0  7.6
## HEALEY.A.H.      8.0  7.6  6.6  7.2  6.5  6.5  6.8  6.7  6.4  6.5  6.9  6.7
## HULL,T.C.        7.7  7.7  6.7  7.5  7.4  7.5  7.1  7.3  7.1  7.3  8.1  7.4
## LEVINE,I.        8.3  8.2  7.4  7.8  7.7  7.7  7.7  7.8  7.5  7.6  8.0  8.0
## LEVISTER,R.L.    9.6  6.9  5.7  6.6  6.9  6.6  6.2  6.0  5.8  5.8  7.2  6.0
## MARTIN,L.F.      7.1  8.2  7.7  7.1  6.6  6.6  6.7  6.7  6.8  6.8  7.5  7.3
## MCGRATH,J.F.     7.6  7.3  6.9  6.8  6.7  6.8  6.4  6.3  6.3  6.3  7.4  6.6
## MIGNONE,A.F.     6.6  7.4  6.2  6.2  5.4  5.7  5.8  5.9  5.2  5.8  4.7  5.2
## MISSAL,H.M.      6.2  8.3  8.1  7.7  7.4  7.3  7.3  7.3  7.2  7.3  7.8  7.6
## MULVEY,H.M.      7.5  8.7  8.5  8.6  8.5  8.4  8.5  8.5  8.4  8.4  8.7  8.7
## NARUK,H.J.       7.8  8.9  8.7  8.9  8.7  8.8  8.9  9.0  8.8  8.9  9.0  9.0
## O'BRIEN,F.J.     7.1  8.5  8.3  8.0  7.9  7.9  7.8  7.8  7.8  7.7  8.3  8.2
## O'SULLIVAN,T.J.  7.5  9.0  8.9  8.7  8.4  8.5  8.4  8.3  8.3  8.3  8.8  8.7
## PASKEY,L.        7.5  8.1  7.7  8.2  8.0  8.1  8.2  8.4  8.0  8.1  8.4  8.1
## RUBINOW,J.E.     7.1  9.2  9.0  9.0  8.4  8.6  9.1  9.1  8.9  9.0  8.9  9.2
## SADEN.G.A.       6.6  7.4  6.9  8.4  8.0  7.9  8.2  8.4  7.7  7.9  8.4  7.5
## SATANIELLO,A.G.  8.4  8.0  7.9  7.9  7.8  7.8  7.6  7.4  7.4  7.4  8.1  7.9
## SHEA,D.M.        6.9  8.5  7.8  8.5  8.1  8.2  8.4  8.5  8.1  8.3  8.7  8.3
## SHEA,J.F.JR.     7.3  8.9  8.8  8.7  8.4  8.5  8.5  8.5  8.4  8.4  8.8  8.8
## SIDOR,W.J.       7.7  6.2  5.1  5.6  5.6  5.9  5.6  5.6  5.3  5.5  6.3  5.3
## SPEZIALE,J.A.    8.5  8.3  8.1  8.3  8.4  8.2  8.2  8.1  7.9  8.0  8.0  8.2
## SPONZO,M.J.      6.9  8.3  8.0  8.1  7.9  7.9  7.9  7.7  7.6  7.7  8.1  8.0
## STAPLETON,J.F.   6.5  8.2  7.7  7.8  7.6  7.7  7.7  7.7  7.5  7.6  8.5  7.7
## TESTO,R.J.       8.3  7.3  7.0  6.8  7.0  7.1  6.7  6.7  6.7  6.7  8.0  7.0
## TIERNEY,W.L.JR.  8.3  8.2  7.8  8.3  8.4  8.3  7.7  7.6  7.5  7.7  8.1  7.9
## WALL,R.A.        9.0  7.0  5.9  7.0  7.0  7.2  6.9  6.9  6.5  6.6  7.6  6.6
## WRIGHT,D.B.      7.1  8.4  8.4  7.7  7.5  7.7  7.8  8.2  8.0  8.1  8.3  8.1
## ZARRILLI,K.J.    8.6  7.4  7.0  7.5  7.5  7.7  7.4  7.2  6.9  7.0  7.8  7.1
par(mfrow = c(3, 4)) 
for (i in 1:ncol(USJR)) { 
  densityPlot <- density(USJR[, i])
  mu <- mean(USJR[, i])
  plot(densityPlot, main = colnames(USJR)[i], col=2) 
  abline(v=mu, col='blue')
}

2

1.2 Outlier analysis II

(a) Choose any company from the Austrian Traded Index (ATX) and download

the stock prices for this company in 2022 (daily basis).

RBI <- read_csv("https://raw.githubusercontent.com/MarkusStefan/Data_Analytics/main/Exercise1/RBI.VI.csv")
## Rows: 4485 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (6): Open, High, Low, Close, Adj Close, Volume
## date (1): Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# creating columns for month and year
RBI <- RBI %>% mutate(month = months(RBI$Date),
                      year = year(RBI$Date))

(b) Based on the data from (a), calculate daily revenues of the stock prices.

RBI <- RBI %>% mutate(DailyRev =
  (RBI$`Adj Close` - RBI$Open) * RBI$Volume)

ggplot(data=RBI, aes(x=Date, y=DailyRev)) +
  geom_line(col='dark blue')

(c) Try different instruments for outlier analysis as discussed in lecture and identify

potential outliers. - Standardizing the data (z-scores)

RBI <- RBI %>% mutate(DailyRevNorm =
  (DailyRev-mean(DailyRev))/sd(DailyRev))

ggplot(data=RBI, aes(x=Date, y=DailyRevNorm)) +
  geom_line(col='dark blue')

removing outliers by z-scores

library(lubridate)
outliers_RBI <- RBI[abs(RBI$DailyRevNorm) > 3, ] # RBI %>% filter(DailyRevNorm > 5 | DailyRevNorm < 5) 
m <- months(outliers_RBI$Date)
y <- year(outliers_RBI$Date)
outliers_RBI
## # A tibble: 89 × 11
##    Date        Open  High   Low Close Adj C…¹ Volume month  year Daily…² Daily…³
##    <date>     <dbl> <dbl> <dbl> <dbl>   <dbl>  <dbl> <chr> <dbl>   <dbl>   <dbl>
##  1 2005-04-25  37.3  39.9  37.3  39.6    20.0 7.45e6 April  2005 -1.29e8  -19.0 
##  2 2005-04-26  39.6  39.6  38.4  38.8    19.6 1.42e6 April  2005 -2.83e7   -3.53
##  3 2005-04-27  38.4  38.6  38.3  38.4    19.4 1.63e6 April  2005 -3.10e7   -3.95
##  4 2006-02-28  68.6  68.9  66.4  67.0    33.9 1.86e6 Febr…  2006 -6.46e7   -9.12
##  5 2006-05-22  66.1  66.1  56.5  56.5    28.6 9.18e5 May    2006 -3.44e7   -4.48
##  6 2006-05-23  57.6  63.5  57.6  62.6    31.6 1.13e6 May    2006 -2.93e7   -3.70
##  7 2006-06-08  61.3  61.8  59.6  60.3    30.5 9.59e5 June   2006 -2.95e7   -3.73
##  8 2007-01-08 108.  108.  103.  105.     53.6 7.56e5 Janu…  2007 -4.14e7   -5.55
##  9 2007-02-27 106.  106.   98.2  98.2    50.0 9.42e5 Febr…  2007 -5.26e7   -7.28
## 10 2007-02-28  89.1  99.5  88.1  97.4    49.6 8.91e5 Febr…  2007 -3.52e7   -4.59
## # … with 79 more rows, and abbreviated variable names ¹​`Adj Close`, ²​DailyRev,
## #   ³​DailyRevNorm
barplot(table(m), las=2, col=3)

barplot(table(y), col="pink")

  • Outlier analysis with interquantile range

option to spot outliers with boxplot()

boxplot(RBI$DailyRev)$out

##   [1] -129074750  -28288247  -31011464  -13057319  -13225256  -12283425
##   [7]  -12438510  -14405735  -14730996  -13563444  -14144867  -13228900
##  [13]  -14685421  -16900141  -17367965  -12114524  -19692963  -15782094
##  [19]  -14866920  -12613888  -64601702  -15005763  -12579335  -12846175
##  [25]  -20700257  -13241903  -15430144  -17431249  -13767322  -18833545
##  [31]  -19587314  -20112003  -15001034  -19348913  -19932006  -34431853
##  [37]  -29336773  -22711940  -19124261  -13979831  -17381912  -12376512
##  [43]  -13234784  -29539156  -12987009  -22543823  -18061791  -12422268
##  [49]  -15959999  -18660325  -18203786  -13442049  -15074129  -12238741
##  [55]  -13497191  -17914249  -13465363  -13695796  -16290995  -12479647
##  [61]  -12368022  -18483396  -24036889  -24786368  -24785533  -13741164
##  [67]  -15102844  -14573298  -23290236  -12308724  -12209884  -16287063
##  [73]  -21995843  -41416037  -23323775  -15061830  -16341171  -17142398
##  [79]  -14457131  -12194801  -12543620  -13950902  -17932176  -15375077
##  [85]  -52598388  -35174358  -27155848  -16330459  -28508474  -13040246
##  [91]  -26203584  -20435637  -14751098  -22645304  -17673491  -25907245
##  [97]  -15297639  -13180873  -14048164  -14226979  -26085966  -12519471
## [103]  -14098858  -13456119  -12787608  -14662154  -18640542  -17306006
## [109]  -18894652  -17287905  -17195844  -16129426  -13748119  -18237145
## [115]  -19207953  -29947103  -13818184  -31748625  -36223591  -13450747
## [121]  -17082614  -40923916  -25137593  -12253291  -13761394  -23776117
## [127]  -26464784  -14346593  -27374072  -18342619  -24172872  -33160824
## [133]  -18361023  -23090335  -16499626  -18115527  -22899928  -24328073
## [139]  -14029240  -16553322  -13252918  -15422689  -15874294  -15033538
## [145]  -13932170  -14252721  -14034882  -14509180  -23749601  -19538535
## [151]  -17389858  -13267782  -12384916  -19251436  -12736969  -22603814
## [157]  -37609605  -29711803  -21721699  -41827406  -57920846  -23561061
## [163]  -12344702  -13853310  -14142403  -16204006  -15433024  -13752508
## [169]  -19477078  -17168568  -16732015  -15119369  -14029752  -27891830
## [175]  -13947503  -22959526  -19995471  -25027550  -16471197  -26790594
## [181]  -28598505  -61495978  -34261135  -17465032  -14820541  -20085213
## [187]  -17356350  -16125767  -12883101  -15534498  -15488827  -20251074
## [193]  -17770289  -20581707  -30355892  -22744576  -14430166  -13135976
## [199]  -13313342  -20838940  -14966050  -22635387  -34005454  -23218115
## [205]  -20879587  -14373804  -18427344  -13453671  -26313051  -22506763
## [211]  -17287251  -18125684  -15028421  -13166284  -14011018  -14510536
## [217]  -12181995  -14418627  -12667876  -14654435  -21276634  -18807763
## [223]  -16627286  -35896482  -19939193  -38321560  -26842460  -21391910
## [229]  -31008546  -27064728  -21330785  -25170135  -27314612  -36463305
## [235]  -27939493  -25754726  -13574441  -16668449  -21973572  -19901501
## [241]  -17481750  -14761275  -18393312  -15928474  -12371537  -19696256
## [247]  -22634046  -18483505  -15211204  -16857100  -15574940  -35811437
## [253]  -29593964  -23345585  -59681005  -24207759  -35127333  -24585848
## [259]  -15093792  -13510179  -14245385  -30363350  -21647011  -18406427
## [265]  -25280718  -16694454  -15706419  -13855999  -20709696  -19933432
## [271]  -15610560  -17091012  -14317396  -40949418  -29920223  -16129843
## [277]  -12526243  -19066482  -12147111  -19578257  -14199551  -14602882
## [283]  -18382416  -17027891  -18820804  -15418931  -13573159  -16955433
## [289]  -20533090  -39226369  -32461708  -17816711  -20102270  -14889283
## [295]  -60113061  -15617312  -18240480  -15324198  -15655317  -23364056
## [301]  -12970807  -21942427  -16547946  -19757915  -13099864  -14769805
## [307]  -12989053  -18672712  -15000216  -14798086  -18219696  -15554677
## [313]  -16118614  -13471155  -16116226  -18841627  -14612056  -13542054
## [319]  -17160462  -17810170  -26905650  -17924793  -31353144  -22087724
## [325]  -17448694  -14508487  -15077593  -14234642  -18748717  -15739272
## [331]  -27835243  -30730150  -24808842  -26407928  -15314180  -13912734
## [337]  -21458040  -14727583  -18601521  -14239270  -14433164  -12267962
## [343]  -16110419  -31334228  -23157312  -17303462  -16404794  -13769975
## [349]  -28509417  -13198965  -12277363  -19329465  -22561259  -14362152
## [355]  -14901149  -38229628  -25837708  -22715318  -45815955  -60252235
## [361]  -46878395  -66673555  -17401111  -27101820  -13855716  -17560890
## [367]  -20776772  -37041516  -27139561  -26198161  -26758741  -17222398
## [373]  -17871200  -33075526  -45292806  -30738419  -18401678  -53743770
## [379]  -37111349  -37859792  -24209564  -16202627  -12316678  -13939625
## [385]  -13154578  -15148029  -20272461  -15262680  -14842008  -21068872
## [391]  -33215762  -14880294  -15470824  -17545932  -19707413  -17173873
## [397]  -17670485  -13604429  -13073495  -12111150  -12902470  -15655081
## [403]  -14709049  -46698201  -32045478  -19099575  -22103925  -13023435
## [409]  -21744505  -12110475  -23072084  -18168188  -18562748  -13667872
## [415]  -13294176  -16838141  -15427201  -13018389  -14429391  -44261890
## [421]  -37790525  -14175200  -12932614  -13129797  -13704791  -18682254
## [427]  -25872472  -18398747  -14280919  -18953733  -13623254  -13717891
## [433]  -13454238  -12257898  -15000882  -12859594  -23441442  -13428714
## [439]  -13665917  -12371794  -13551189  -18015551  -26448393  -13293590
## [445]  -13706998  -13460674  -15041022  -33039262  -13256939  -20262386
## [451]  -14023370  -12892207  -12357426  -16603473

function for computing the values of Q1 and Q3

q1q3 <- function(x) {
  sorted <- sort(x, decreasing=F)
  # n observations
  n <- length(sorted)
  
  # index of median
  # ceiling() chooses the upper value if remainder results from floor division
  median_index <- ceiling(n / 2)
  
  # first quartile
  q1_index <- ceiling(median_index / 2)
  q1 <- ifelse(median_index %% 2 == 0, 
               (sorted[q1_index] + sorted[q1_index + 1]) / 2, 
               # else
               sorted[q1_index])
  
  # third quartile
  # -1 such that indices are not skewed
  q3_index <- median_index + q1_index - 1
  q3 <- ifelse(median_index %% 2 == 0, 
               (sorted[q3_index] + sorted[q3_index + 1]) / 2, 
               # else
               sorted[q3_index])
  
 t# return results for q1 & q3
  return(list(q1 = q1, q3 = q3))
}

q1q3(RBI$DailyRev)
## $q1
## [1] -5997934
## 
## $q3
## [1] -1931826
q3 <- quantile(sort(RBI$DailyRev), 0.75)
q1 <- quantile(sort(RBI$DailyRev), 0.25)
q1_q3 <- IQR(sort(RBI$DailyRev))

# reomoving values if they fall out of interquartile range by 1.5 times the interquartile range
# ! negates the logical expression, selecting every row where the condition does NOT apply
no_outliers <- RBI[!RBI$DailyRev < (q1-1.5*q1_q3) | RBI$DailyRev > (q3+1.5*q1_q3), ]
no_outliers
## # A tibble: 4,031 × 11
##    Date        Open  High   Low Close Adj C…¹ Volume month  year Daily…² Daily…³
##    <date>     <dbl> <dbl> <dbl> <dbl>   <dbl>  <dbl> <chr> <dbl>   <dbl>   <dbl>
##  1 2005-05-02  38.4  38.9  38.2  38.5    19.4 354371 May    2005 -6.74e6  -0.216
##  2 2005-05-03  38.5  38.6  38.1  38.3    19.4 236774 May    2005 -4.53e6   0.123
##  3 2005-05-04  38.3  38.3  38.1  38.2    19.3 190578 May    2005 -3.62e6   0.263
##  4 2005-05-05  38.2  38.2  38.2  38.2    19.3      0 May    2005  0        0.821
##  5 2005-05-09  37.7  38.1  37.4  37.6    19.0 223719 May    2005 -4.19e6   0.177
##  6 2005-05-10  37.7  38.5  37.6  38.5    19.4 327572 May    2005 -5.99e6  -0.101
##  7 2005-05-11  38.4  39.6  38.4  38.8    19.6 358542 May    2005 -6.73e6  -0.215
##  8 2005-05-12  39.3  40.8  39.3  40.8    20.6 384865 May    2005 -7.18e6  -0.284
##  9 2005-05-13  40.2  40.8  39.9  40.6    20.5 354108 May    2005 -6.98e6  -0.253
## 10 2005-05-16  40.6  40.6  40.6  40.6    20.5      0 May    2005  0        0.821
## # … with 4,021 more rows, and abbreviated variable names ¹​`Adj Close`,
## #   ²​DailyRev, ³​DailyRevNorm
ggplot(data=no_outliers, aes(x=Date, y=DailyRev, col=factor(year))) +
  geom_line() +
  theme_minimal()

  #scale_color_brewer(palette="Spectral")
ggplot(data=no_outliers, aes(x=Date, y=DailyRevNorm, color=month)) +
  geom_line() +
  facet_wrap(~month) #+ scale_color_brewer(palette="Spectral")

outliers per month

# binary classification, whether revenue is an outlier or not
RBI <- RBI %>% mutate(outlier=ifelse(abs(DailyRevNorm) > 3, 1, 0))
RBI %>% 
    dplyr::group_by(month) %>% 
    drop_na() %>% 
    summarize(total_outliers = sum(outlier)) %>% 
    arrange(-total_outliers)
## # A tibble: 12 × 2
##    month     total_outliers
##    <chr>              <dbl>
##  1 January               13
##  2 October               13
##  3 May                   11
##  4 September             11
##  5 March                 10
##  6 February               8
##  7 April                  7
##  8 August                 6
##  9 July                   5
## 10 November               3
## 11 June                   2
## 12 December               0

time, when most outliers occured

RBI <- RBI %>% mutate(MY=paste(month, year, sep='/'))
RBI %>% 
    dplyr::group_by(MY) %>% 
    summarize(total_outliers = sum(outlier)) %>% 
    arrange(-total_outliers) %>% 
    head(n=10)
## # A tibble: 10 × 2
##    MY             total_outliers
##    <chr>                   <dbl>
##  1 January/2008               10
##  2 September/2008              9
##  3 October/2008                8
##  4 May/2007                    7
##  5 July/2008                   5
##  6 March/2007                  5
##  7 August/2007                 4
##  8 October/2007                4
##  9 April/2005                  3
## 10 April/2008                  3

3

1.3 Prepare a brief country profile of your country of origin using the data from The World Factbook of CIA and compare the data of ‘your’ country with the data for Austria. (If your country of origin is Austria you can take any other country for comparison). The country profile should include:

#CIA <- read_csv("export.csv", sep=';')
#CIA
countries <- c("AT", "DE")
pop <- c(9000000, 79000000)
public_debt <- c( 99.91, 63.90)
unrate <- c(6.3, 3.54)
gdp <- c()


comparison <- data.frame(countries, pop, unrate)
comparison
##   countries     pop unrate
## 1        AT 9.0e+06   6.30
## 2        DE 7.9e+07   3.54
#barplot(data=comparison[, 2])
ggplot(comparison, aes(x = countries, y = pop)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(x = "Country", y = "Population Size") +
  ggtitle("Population Sizes of Germany and Austria")

(a) Four general statistics (plots): population, GDP, unemployment, public debt.

(b) One statistics which you choose on your own.

#import pandas as pd
#import numpy as np
#import matplotlib.pyplot as plt
#import requests
#url = "https://www.imf.org/external/datamapper/api/v1/NGDP_RPCH/AUT/DEU?year=2023&yaxis=lin"
#response = requests.get(url)
#data = response.json()
#df = pd.read_json(url)
#data
def loop(msg):
  dic = {}
  for l in msg:
    if l not in dic:
      dic[l] = 1
    else:
      dic[l] += 1
  ls = []
  for i in range(len(dic)):
    ls.append(i)
  print(msg)
  return dic, ls


loop('hello world!')
## hello world!
## ({'h': 1, 'e': 1, 'l': 3, 'o': 2, ' ': 1, 'w': 1, 'r': 1, 'd': 1, '!': 1}, [0, 1, 2, 3, 4, 5, 6, 7, 8])

(c) All data should be presented in an appropriate way (as discussed in the lecture) to allow for a comparison of two countries.

4

1.4 Data preparation and visualization

(a) Download happiness scores from the World Happiness Report (the Gallup World Poll) from www.ourworldindata.org (happiness-cantril-ladder.csv). The main life evaluation question asked in the poll is: “Please imagine a ladder, with steps numbered from 0 at the bottom to 10 at the top. The top of the ladder represents the best possible life for you and the bottom of the ladder represents the worst possible life for you. On which step of the ladder would you say you personally feel you stand at this time?” (Also known as the “Cantril Ladder”.)

#happy <- read_csv("https://raw.githubusercontent.com/etadpu/open_source_data/main/happiness-cantril-ladder.csv")
happy <- read_csv("https://raw.githubusercontent.com/MarkusStefan/Data_Analytics/main/Exercise1/happiness-cantril-ladder.csv")
## Rows: 2381 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, Cantril ladder score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# clean column names 
colnames(happy) <- make_clean_names(colnames(happy))
happy
## # A tibble: 2,381 × 4
##    entity      code   year cantril_ladder_score
##    <chr>       <chr> <dbl>                <dbl>
##  1 Afghanistan AFG    2006                 3.72
##  2 Afghanistan AFG    2007                 4.40
##  3 Afghanistan AFG    2008                 4.76
##  4 Afghanistan AFG    2009                 3.83
##  5 Afghanistan AFG    2010                 3.78
##  6 Afghanistan AFG    2011                 3.57
##  7 Afghanistan AFG    2012                 3.13
##  8 Afghanistan AFG    2013                 3.98
##  9 Afghanistan AFG    2014                 4.22
## 10 Afghanistan AFG    2015                 2.66
## # … with 2,371 more rows

(b) The imported data should consist of 4 columns. Find and present 5 countries with the highest/lowest happiness in 2010 and in 2020.

# extracting relevant years form panel data
# remove columns 2 & 3 s.t. only the country and the happyness score remain
# sorting all columns by happyness score in dscending order (-)
#h2010 <- happy[happy$year == 2010 , ] %>% arrange(-cantril_ladder_score) 
#h2020 <- happy[happy$year == 2020, ] %>% arrange(-cantril_ladder_score) 
h2010 <- happy[happy$year == 2010 , -(2:3)] %>% arrange(-cantril_ladder_score) %>% invisible()
h2020 <- happy[happy$year == 2020, -(2:3)] %>% arrange(-cantril_ladder_score) %>% invisible()
# renaming column name 
colnames(h2010)[2] <- "score2010" 
colnames(h2020)[2] <- "score2020"
# selecting the top & bottom 5 rows
head(h2010, 5); top_n(h2010, -5)
## # A tibble: 5 × 2
##   entity      score2010
##   <chr>           <dbl>
## 1 Switzerland      7.78
## 2 Norway           7.68
## 3 Iceland          7.59
## 4 Sweden           7.56
## 5 Denmark          7.52
## Selecting by score2010
## # A tibble: 5 × 2
##   entity     score2010
##   <chr>          <dbl>
## 1 Guinea          3.65
## 2 Madagascar      3.55
## 3 Rwanda          3.33
## 4 Benin           3.19
## 5 Syria           3.16
head(h2020, 5); top_n(h2020, -5) 
## # A tibble: 5 × 2
##   entity      score2020
##   <chr>           <dbl>
## 1 Finland          7.82
## 2 Denmark          7.64
## 3 Iceland          7.56
## 4 Switzerland      7.51
## 5 Netherlands      7.41
## Selecting by score2020
## # A tibble: 5 × 2
##   entity      score2020
##   <chr>           <dbl>
## 1 Botswana         3.47
## 2 Rwanda           3.27
## 3 Zimbabwe         3.00
## 4 Lebanon          2.96
## 5 Afghanistan      2.40

(c) Find 5 countries with the biggest absolute change (both increase and decrease) of happiness between 2010 and 2020. Take care of countries where data for 2010 or 2020 is missing!

# merging both dataframes on the country column
h_2010_2020 <- merge(h2010, h2020, by="entity", all.x = TRUE)
h_2010_2020
##                           entity score2010 score2020
## 1                    Afghanistan  3.782938    2.4038
## 2                        Albania  5.510124    5.1988
## 3                        Algeria  5.604596    5.1223
## 4                         Angola  4.360250        NA
## 5                      Argentina  6.468387    5.9670
## 6                        Armenia  4.319712    5.3986
## 7                      Australia  7.195586    7.1621
## 8                        Austria  7.400689    7.1630
## 9                     Azerbaijan  4.910772    5.1734
## 10                       Bahrain  5.027187    6.6469
## 11                    Bangladesh  4.724444    5.1555
## 12                       Belarus  5.749043    5.8215
## 13                       Belgium  6.935122    6.8050
## 14                         Benin  3.193469    4.6232
## 15                       Bolivia  6.018895    5.6003
## 16        Bosnia and Herzegovina  4.773145    5.7680
## 17                      Botswana  4.835939    3.4711
## 18                        Brazil  6.660004    6.2928
## 19                      Bulgaria  4.222297    5.3709
## 20                  Burkina Faso  3.955008    4.6705
## 21                      Cambodia  3.898707    4.6403
## 22                      Cameroon  4.244634    5.0476
## 23                        Canada  7.415144    7.0251
## 24                          Chad  4.032975    4.2508
## 25                         Chile  6.599129    6.1718
## 26                         China  5.094917    5.5853
## 27                      Colombia  6.374880    5.7806
## 28                       Comoros  3.955640    4.6086
## 29                         Congo  3.919342    5.0752
## 30                    Costa Rica  7.272250    6.5820
## 31                       Croatia  6.027635    6.1252
## 32                        Cyprus  6.180507    6.2209
## 33                       Czechia  6.334149    6.9198
## 34  Democratic Republic of Congo  4.639227        NA
## 35                       Denmark  7.519909    7.6362
## 36            Dominican Republic  4.753311    5.7372
## 37                       Ecuador  5.960716    5.5333
## 38                         Egypt  4.204157    4.2877
## 39                   El Salvador  5.934372    6.1200
## 40                       Estonia  5.363928    6.3408
## 41                      Ethiopia  4.561169    4.2415
## 42                       Finland  7.420209    7.8210
## 43                        France  6.649365    6.6867
## 44                         Gabon  3.972059    4.9583
## 45                       Georgia  4.254446    4.9732
## 46                       Germany  6.702362    7.0341
## 47                         Ghana  5.057262    4.8720
## 48                        Greece  5.096354    5.9482
## 49                     Guatemala  5.855717    6.2622
## 50                        Guinea  3.651555    4.8905
## 51                         Haiti  4.413475        NA
## 52                      Honduras  4.602218    6.0221
## 53                     Hong Kong  5.483765    5.4248
## 54                       Hungary  4.683358    6.0859
## 55                       Iceland  7.590660    7.5575
## 56                         India  4.720147    3.7771
## 57                     Indonesia  5.367774    5.2396
## 58                          Iran  4.608928    4.8879
## 59                          Iraq  4.659509    4.9409
## 60                       Ireland  6.964645    7.0408
## 61                        Israel  7.110855    7.3638
## 62                         Italy  5.839314    6.4667
## 63                         Japan  5.968216    6.0389
## 64                        Jordan  5.131996    4.1521
## 65                    Kazakhstan  5.759469    6.2341
## 66                         Kenya  4.547335    4.5430
## 67                        Kosovo  5.639588    6.4551
## 68                        Kuwait  6.221095    6.1061
## 69                    Kyrgyzstan  5.207786    5.8285
## 70                          Laos  4.876085    5.1403
## 71                        Latvia  5.125025    6.1804
## 72                       Lebanon  4.572567    2.9553
## 73                         Libya  5.754394    5.3302
## 74                     Lithuania  5.771037    6.4456
## 75                    Luxembourg  6.964097    7.4040
## 76                    Madagascar  3.550610    4.3391
## 77                        Malawi  4.279270    3.7504
## 78                      Malaysia  5.914284    5.7113
## 79                          Mali  4.313017    4.4790
## 80                         Malta  5.962872    6.4469
## 81                    Mauritania  4.673204    4.1526
## 82                        Mexico  7.320185    6.1276
## 83                       Moldova  5.995713    5.8568
## 84                      Mongolia  4.885150    5.7607
## 85                    Montenegro  5.218724    5.5468
## 86                       Morocco  4.969657    5.0599
## 87                       Myanmar  4.438940    4.3942
## 88                         Nepal  4.233245    5.3765
## 89                   Netherlands  7.470715    7.4149
## 90                   New Zealand  7.249630    7.1998
## 91                     Nicaragua  5.448006    6.1646
## 92                         Niger  3.798088    5.0035
## 93                       Nigeria  5.492954    4.5520
## 94               North Macedonia  4.639647    5.1990
## 95               Northern Cyprus  5.463305    5.4666
## 96                        Norway  7.678277    7.3651
## 97                      Pakistan  5.131565    4.5158
## 98                     Palestine  4.646608    4.4825
## 99                        Panama  6.859836    6.3091
## 100                     Paraguay  5.820058    5.5783
## 101                         Peru  5.824557    5.5591
## 102                  Philippines  5.001965    5.9036
## 103                       Poland  5.875932    6.1228
## 104                     Portugal  4.993962    6.0163
## 105                        Qatar  6.611299        NA
## 106                      Romania  5.166875    6.4770
## 107                       Russia  5.620736    5.4586
## 108                       Rwanda  3.333048    3.2682
## 109                 Saudi Arabia  6.396359    6.5228
## 110                      Senegal  3.668737    5.0461
## 111                       Serbia  5.154522    6.1780
## 112                     Slovakia  5.911059    6.3907
## 113                     Slovenia  6.062891    6.6301
## 114            Somaliland region  5.057314        NA
## 115                 South Africa  5.133888    5.1939
## 116                  South Korea  6.003287    5.9351
## 117                        Spain  6.290690    6.4763
## 118                    Sri Lanka  4.224593    4.3622
## 119                        Sudan  4.550499        NA
## 120                     Suriname  6.269287        NA
## 121                       Sweden  7.560148    7.3843
## 122                  Switzerland  7.776209    7.5116
## 123                        Syria  3.164491        NA
## 124                       Taiwan  6.125917    6.5125
## 125                   Tajikistan  4.496571    5.3771
## 126                     Tanzania  4.006897    3.7024
## 127                     Thailand  6.300235    5.8912
## 128                      Tunisia  4.463531    4.5160
## 129                       Turkey  5.309076    4.7442
## 130                 Turkmenistan  5.463827    5.4743
## 131                       Uganda  4.309238    4.6026
## 132                      Ukraine  5.030342    5.0841
## 133         United Arab Emirates  7.217767    6.5760
## 134               United Kingdom  6.880784    6.9425
## 135                United States  7.026227    6.9768
## 136                      Uruguay  6.449728    6.4736
## 137                   Uzbekistan  6.019332    6.0627
## 138                    Venezuela  7.066577    4.9255
## 139                      Vietnam  5.534570    5.4850
## 140                        Yemen  4.060601    4.1969
## 141                       Zambia  5.013375    3.7598
## 142                     Zimbabwe  4.955101    2.9951
# other option: joining with SQL statement
# renaming colnames s.t. we remove ambiguity 
# (joinin won't work if columns have the same names)
colnames(h2010) <- c("entity2010", "score2010")
colnames(h2020) <- c("entity2020","score2020")
library(sqldf)
sqldf("
      SELECT *
      FROM h2010 
      LEFT JOIN h2020
      ON entity2010 = entity2020
      ") 
##                       entity2010 score2010             entity2020 score2020
## 1                    Switzerland  7.776209            Switzerland    7.5116
## 2                         Norway  7.678277                 Norway    7.3651
## 3                        Iceland  7.590660                Iceland    7.5575
## 4                         Sweden  7.560148                 Sweden    7.3843
## 5                        Denmark  7.519909                Denmark    7.6362
## 6                    Netherlands  7.470715            Netherlands    7.4149
## 7                        Finland  7.420209                Finland    7.8210
## 8                         Canada  7.415144                 Canada    7.0251
## 9                        Austria  7.400689                Austria    7.1630
## 10                        Mexico  7.320185                 Mexico    6.1276
## 11                    Costa Rica  7.272250             Costa Rica    6.5820
## 12                   New Zealand  7.249630            New Zealand    7.1998
## 13          United Arab Emirates  7.217767   United Arab Emirates    6.5760
## 14                     Australia  7.195586              Australia    7.1621
## 15                        Israel  7.110855                 Israel    7.3638
## 16                     Venezuela  7.066577              Venezuela    4.9255
## 17                 United States  7.026227          United States    6.9768
## 18                       Ireland  6.964645                Ireland    7.0408
## 19                    Luxembourg  6.964097             Luxembourg    7.4040
## 20                       Belgium  6.935122                Belgium    6.8050
## 21                United Kingdom  6.880784         United Kingdom    6.9425
## 22                        Panama  6.859836                 Panama    6.3091
## 23                       Germany  6.702362                Germany    7.0341
## 24                        Brazil  6.660004                 Brazil    6.2928
## 25                        France  6.649365                 France    6.6867
## 26                         Qatar  6.611299                   <NA>        NA
## 27                         Chile  6.599129                  Chile    6.1718
## 28                     Argentina  6.468387              Argentina    5.9670
## 29                       Uruguay  6.449728                Uruguay    6.4736
## 30                  Saudi Arabia  6.396359           Saudi Arabia    6.5228
## 31                      Colombia  6.374880               Colombia    5.7806
## 32                       Czechia  6.334149                Czechia    6.9198
## 33                      Thailand  6.300235               Thailand    5.8912
## 34                         Spain  6.290690                  Spain    6.4763
## 35                      Suriname  6.269287                   <NA>        NA
## 36                        Kuwait  6.221095                 Kuwait    6.1061
## 37                        Cyprus  6.180507                 Cyprus    6.2209
## 38                        Taiwan  6.125917                 Taiwan    6.5125
## 39                      Slovenia  6.062891               Slovenia    6.6301
## 40                       Croatia  6.027635                Croatia    6.1252
## 41                    Uzbekistan  6.019332             Uzbekistan    6.0627
## 42                       Bolivia  6.018895                Bolivia    5.6003
## 43                   South Korea  6.003287            South Korea    5.9351
## 44                       Moldova  5.995713                Moldova    5.8568
## 45                         Japan  5.968216                  Japan    6.0389
## 46                         Malta  5.962872                  Malta    6.4469
## 47                       Ecuador  5.960716                Ecuador    5.5333
## 48                   El Salvador  5.934372            El Salvador    6.1200
## 49                      Malaysia  5.914284               Malaysia    5.7113
## 50                      Slovakia  5.911059               Slovakia    6.3907
## 51                        Poland  5.875932                 Poland    6.1228
## 52                     Guatemala  5.855717              Guatemala    6.2622
## 53                         Italy  5.839314                  Italy    6.4667
## 54                          Peru  5.824557                   Peru    5.5591
## 55                      Paraguay  5.820058               Paraguay    5.5783
## 56                     Lithuania  5.771037              Lithuania    6.4456
## 57                    Kazakhstan  5.759469             Kazakhstan    6.2341
## 58                         Libya  5.754394                  Libya    5.3302
## 59                       Belarus  5.749043                Belarus    5.8215
## 60                        Kosovo  5.639588                 Kosovo    6.4551
## 61                        Russia  5.620736                 Russia    5.4586
## 62                       Algeria  5.604596                Algeria    5.1223
## 63                       Vietnam  5.534570                Vietnam    5.4850
## 64                       Albania  5.510124                Albania    5.1988
## 65                       Nigeria  5.492954                Nigeria    4.5520
## 66                     Hong Kong  5.483765              Hong Kong    5.4248
## 67                  Turkmenistan  5.463827           Turkmenistan    5.4743
## 68               Northern Cyprus  5.463305        Northern Cyprus    5.4666
## 69                     Nicaragua  5.448006              Nicaragua    6.1646
## 70                     Indonesia  5.367774              Indonesia    5.2396
## 71                       Estonia  5.363928                Estonia    6.3408
## 72                        Turkey  5.309076                 Turkey    4.7442
## 73                    Montenegro  5.218724             Montenegro    5.5468
## 74                    Kyrgyzstan  5.207786             Kyrgyzstan    5.8285
## 75                       Romania  5.166875                Romania    6.4770
## 76                        Serbia  5.154522                 Serbia    6.1780
## 77                  South Africa  5.133888           South Africa    5.1939
## 78                        Jordan  5.131996                 Jordan    4.1521
## 79                      Pakistan  5.131565               Pakistan    4.5158
## 80                        Latvia  5.125025                 Latvia    6.1804
## 81                        Greece  5.096354                 Greece    5.9482
## 82                         China  5.094917                  China    5.5853
## 83             Somaliland region  5.057314                   <NA>        NA
## 84                         Ghana  5.057262                  Ghana    4.8720
## 85                       Ukraine  5.030342                Ukraine    5.0841
## 86                       Bahrain  5.027187                Bahrain    6.6469
## 87                        Zambia  5.013375                 Zambia    3.7598
## 88                   Philippines  5.001965            Philippines    5.9036
## 89                      Portugal  4.993962               Portugal    6.0163
## 90                       Morocco  4.969657                Morocco    5.0599
## 91                      Zimbabwe  4.955101               Zimbabwe    2.9951
## 92                    Azerbaijan  4.910772             Azerbaijan    5.1734
## 93                      Mongolia  4.885150               Mongolia    5.7607
## 94                          Laos  4.876085                   Laos    5.1403
## 95                      Botswana  4.835939               Botswana    3.4711
## 96        Bosnia and Herzegovina  4.773145 Bosnia and Herzegovina    5.7680
## 97            Dominican Republic  4.753311     Dominican Republic    5.7372
## 98                    Bangladesh  4.724444             Bangladesh    5.1555
## 99                         India  4.720147                  India    3.7771
## 100                      Hungary  4.683358                Hungary    6.0859
## 101                   Mauritania  4.673204             Mauritania    4.1526
## 102                         Iraq  4.659509                   Iraq    4.9409
## 103                    Palestine  4.646608              Palestine    4.4825
## 104              North Macedonia  4.639647        North Macedonia    5.1990
## 105 Democratic Republic of Congo  4.639227                   <NA>        NA
## 106                         Iran  4.608928                   Iran    4.8879
## 107                     Honduras  4.602218               Honduras    6.0221
## 108                      Lebanon  4.572567                Lebanon    2.9553
## 109                     Ethiopia  4.561169               Ethiopia    4.2415
## 110                        Sudan  4.550499                   <NA>        NA
## 111                        Kenya  4.547335                  Kenya    4.5430
## 112                   Tajikistan  4.496571             Tajikistan    5.3771
## 113                      Tunisia  4.463531                Tunisia    4.5160
## 114                      Myanmar  4.438940                Myanmar    4.3942
## 115                        Haiti  4.413475                   <NA>        NA
## 116                       Angola  4.360250                   <NA>        NA
## 117                      Armenia  4.319712                Armenia    5.3986
## 118                         Mali  4.313017                   Mali    4.4790
## 119                       Uganda  4.309238                 Uganda    4.6026
## 120                       Malawi  4.279270                 Malawi    3.7504
## 121                      Georgia  4.254446                Georgia    4.9732
## 122                     Cameroon  4.244634               Cameroon    5.0476
## 123                        Nepal  4.233245                  Nepal    5.3765
## 124                    Sri Lanka  4.224593              Sri Lanka    4.3622
## 125                     Bulgaria  4.222297               Bulgaria    5.3709
## 126                        Egypt  4.204157                  Egypt    4.2877
## 127                        Yemen  4.060601                  Yemen    4.1969
## 128                         Chad  4.032975                   Chad    4.2508
## 129                     Tanzania  4.006897               Tanzania    3.7024
## 130                        Gabon  3.972059                  Gabon    4.9583
## 131                      Comoros  3.955640                Comoros    4.6086
## 132                 Burkina Faso  3.955008           Burkina Faso    4.6705
## 133                        Congo  3.919342                  Congo    5.0752
## 134                     Cambodia  3.898707               Cambodia    4.6403
## 135                        Niger  3.798088                  Niger    5.0035
## 136                  Afghanistan  3.782938            Afghanistan    2.4038
## 137                      Senegal  3.668737                Senegal    5.0461
## 138                       Guinea  3.651555                 Guinea    4.8905
## 139                   Madagascar  3.550610             Madagascar    4.3391
## 140                       Rwanda  3.333048                 Rwanda    3.2682
## 141                        Benin  3.193469                  Benin    4.6232
## 142                        Syria  3.164491                   <NA>        NA
h_2010_2020 <- h_2010_2020 %>% 
  mutate(neg_difference = (score2020-score2010),
         pos_difference = (score2020-score2010))
# negative difference = happyness score decreased from 2010 to 2020
h_2010_2020 %>% select(entity, neg_difference) %>%  arrange(neg_difference);
##                           entity neg_difference
## 1                      Venezuela     -2.1410774
## 2                       Zimbabwe     -1.9600005
## 3                        Lebanon     -1.6172670
## 4                    Afghanistan     -1.3791375
## 5                       Botswana     -1.3648390
## 6                         Zambia     -1.2535750
## 7                         Mexico     -1.1925850
## 8                         Jordan     -0.9798960
## 9                          India     -0.9430467
## 10                       Nigeria     -0.9409543
## 11                    Costa Rica     -0.6902500
## 12          United Arab Emirates     -0.6417670
## 13                      Pakistan     -0.6157650
## 14                      Colombia     -0.5942800
## 15                        Turkey     -0.5648763
## 16                        Panama     -0.5507356
## 17                        Malawi     -0.5288697
## 18                    Mauritania     -0.5206040
## 19                     Argentina     -0.5013870
## 20                       Algeria     -0.4822957
## 21                       Ecuador     -0.4274162
## 22                         Chile     -0.4273287
## 23                         Libya     -0.4241940
## 24                       Bolivia     -0.4185947
## 25                      Thailand     -0.4090353
## 26                        Canada     -0.3900444
## 27                        Brazil     -0.3672037
## 28                      Ethiopia     -0.3196687
## 29                        Norway     -0.3131770
## 30                       Albania     -0.3113240
## 31                      Tanzania     -0.3044974
## 32                          Peru     -0.2654573
## 33                   Switzerland     -0.2646090
## 34                      Paraguay     -0.2417583
## 35                       Austria     -0.2376886
## 36                      Malaysia     -0.2029838
## 37                         Ghana     -0.1852620
## 38                        Sweden     -0.1758480
## 39                     Palestine     -0.1641084
## 40                        Russia     -0.1621356
## 41                       Moldova     -0.1389128
## 42                       Belgium     -0.1301220
## 43                     Indonesia     -0.1281740
## 44                        Kuwait     -0.1149946
## 45                   South Korea     -0.0681870
## 46                        Rwanda     -0.0648479
## 47                     Hong Kong     -0.0589646
## 48                   Netherlands     -0.0558155
## 49                   New Zealand     -0.0498300
## 50                       Vietnam     -0.0495697
## 51                 United States     -0.0494270
## 52                       Myanmar     -0.0447396
## 53                     Australia     -0.0334857
## 54                       Iceland     -0.0331600
## 55                         Kenya     -0.0043350
## 56               Northern Cyprus      0.0032945
## 57                  Turkmenistan      0.0104730
## 58                       Uruguay      0.0238715
## 59                        France      0.0373346
## 60                        Cyprus      0.0403930
## 61                    Uzbekistan      0.0433680
## 62                       Tunisia      0.0524690
## 63                       Ukraine      0.0537580
## 64                  South Africa      0.0600120
## 65                United Kingdom      0.0617160
## 66                         Japan      0.0706836
## 67                       Belarus      0.0724565
## 68                       Ireland      0.0761546
## 69                         Egypt      0.0835430
## 70                       Morocco      0.0902435
## 71                       Croatia      0.0975654
## 72                       Denmark      0.1162906
## 73                  Saudi Arabia      0.1264406
## 74                         Yemen      0.1362992
## 75                     Sri Lanka      0.1376070
## 76                          Mali      0.1659830
## 77                         Spain      0.1856096
## 78                   El Salvador      0.1856285
## 79                          Chad      0.2178253
## 80                        Poland      0.2468683
## 81                        Israel      0.2529454
## 82                    Azerbaijan      0.2626280
## 83                          Laos      0.2642150
## 84                          Iran      0.2789723
## 85                          Iraq      0.2813913
## 86                        Uganda      0.2933620
## 87                    Montenegro      0.3280757
## 88                       Germany      0.3317380
## 89                        Taiwan      0.3865830
## 90                       Finland      0.4007906
## 91                     Guatemala      0.4064830
## 92                    Bangladesh      0.4310560
## 93                    Luxembourg      0.4399030
## 94                    Kazakhstan      0.4746305
## 95                      Slovakia      0.4796406
## 96                         Malta      0.4840280
## 97                         China      0.4903827
## 98               North Macedonia      0.5593525
## 99                      Slovenia      0.5672090
## 100                      Czechia      0.5856506
## 101                   Kyrgyzstan      0.6207144
## 102                        Italy      0.6273860
## 103                      Comoros      0.6529597
## 104                    Lithuania      0.6745630
## 105                 Burkina Faso      0.7154920
## 106                    Nicaragua      0.7165940
## 107                      Georgia      0.7187544
## 108                     Cambodia      0.7415930
## 109                   Madagascar      0.7884904
## 110                     Cameroon      0.8029660
## 111                       Kosovo      0.8155116
## 112                       Greece      0.8518460
## 113                     Mongolia      0.8755496
## 114                   Tajikistan      0.8805285
## 115                  Philippines      0.9016350
## 116                      Estonia      0.9768720
## 117           Dominican Republic      0.9838890
## 118                        Gabon      0.9862408
## 119       Bosnia and Herzegovina      0.9948553
## 120                     Portugal      1.0223377
## 121                       Serbia      1.0234780
## 122                       Latvia      1.0553747
## 123                      Armenia      1.0788883
## 124                        Nepal      1.1432550
## 125                     Bulgaria      1.1486030
## 126                        Congo      1.1558582
## 127                        Niger      1.2054117
## 128                       Guinea      1.2389452
## 129                      Romania      1.3101250
## 130                      Senegal      1.3773630
## 131                      Hungary      1.4025420
## 132                     Honduras      1.4198820
## 133                        Benin      1.4297312
## 134                      Bahrain      1.6197130
## 135                       Angola             NA
## 136 Democratic Republic of Congo             NA
## 137                        Haiti             NA
## 138                        Qatar             NA
## 139            Somaliland region             NA
## 140                        Sudan             NA
## 141                     Suriname             NA
## 142                        Syria             NA
# positive difference = happyness score increased from 2010 to 2020
h_2010_2020 %>% select(entity, pos_difference) %>%  arrange(-pos_difference)
##                           entity pos_difference
## 1                        Bahrain      1.6197130
## 2                          Benin      1.4297312
## 3                       Honduras      1.4198820
## 4                        Hungary      1.4025420
## 5                        Senegal      1.3773630
## 6                        Romania      1.3101250
## 7                         Guinea      1.2389452
## 8                          Niger      1.2054117
## 9                          Congo      1.1558582
## 10                      Bulgaria      1.1486030
## 11                         Nepal      1.1432550
## 12                       Armenia      1.0788883
## 13                        Latvia      1.0553747
## 14                        Serbia      1.0234780
## 15                      Portugal      1.0223377
## 16        Bosnia and Herzegovina      0.9948553
## 17                         Gabon      0.9862408
## 18            Dominican Republic      0.9838890
## 19                       Estonia      0.9768720
## 20                   Philippines      0.9016350
## 21                    Tajikistan      0.8805285
## 22                      Mongolia      0.8755496
## 23                        Greece      0.8518460
## 24                        Kosovo      0.8155116
## 25                      Cameroon      0.8029660
## 26                    Madagascar      0.7884904
## 27                      Cambodia      0.7415930
## 28                       Georgia      0.7187544
## 29                     Nicaragua      0.7165940
## 30                  Burkina Faso      0.7154920
## 31                     Lithuania      0.6745630
## 32                       Comoros      0.6529597
## 33                         Italy      0.6273860
## 34                    Kyrgyzstan      0.6207144
## 35                       Czechia      0.5856506
## 36                      Slovenia      0.5672090
## 37               North Macedonia      0.5593525
## 38                         China      0.4903827
## 39                         Malta      0.4840280
## 40                      Slovakia      0.4796406
## 41                    Kazakhstan      0.4746305
## 42                    Luxembourg      0.4399030
## 43                    Bangladesh      0.4310560
## 44                     Guatemala      0.4064830
## 45                       Finland      0.4007906
## 46                        Taiwan      0.3865830
## 47                       Germany      0.3317380
## 48                    Montenegro      0.3280757
## 49                        Uganda      0.2933620
## 50                          Iraq      0.2813913
## 51                          Iran      0.2789723
## 52                          Laos      0.2642150
## 53                    Azerbaijan      0.2626280
## 54                        Israel      0.2529454
## 55                        Poland      0.2468683
## 56                          Chad      0.2178253
## 57                   El Salvador      0.1856285
## 58                         Spain      0.1856096
## 59                          Mali      0.1659830
## 60                     Sri Lanka      0.1376070
## 61                         Yemen      0.1362992
## 62                  Saudi Arabia      0.1264406
## 63                       Denmark      0.1162906
## 64                       Croatia      0.0975654
## 65                       Morocco      0.0902435
## 66                         Egypt      0.0835430
## 67                       Ireland      0.0761546
## 68                       Belarus      0.0724565
## 69                         Japan      0.0706836
## 70                United Kingdom      0.0617160
## 71                  South Africa      0.0600120
## 72                       Ukraine      0.0537580
## 73                       Tunisia      0.0524690
## 74                    Uzbekistan      0.0433680
## 75                        Cyprus      0.0403930
## 76                        France      0.0373346
## 77                       Uruguay      0.0238715
## 78                  Turkmenistan      0.0104730
## 79               Northern Cyprus      0.0032945
## 80                         Kenya     -0.0043350
## 81                       Iceland     -0.0331600
## 82                     Australia     -0.0334857
## 83                       Myanmar     -0.0447396
## 84                 United States     -0.0494270
## 85                       Vietnam     -0.0495697
## 86                   New Zealand     -0.0498300
## 87                   Netherlands     -0.0558155
## 88                     Hong Kong     -0.0589646
## 89                        Rwanda     -0.0648479
## 90                   South Korea     -0.0681870
## 91                        Kuwait     -0.1149946
## 92                     Indonesia     -0.1281740
## 93                       Belgium     -0.1301220
## 94                       Moldova     -0.1389128
## 95                        Russia     -0.1621356
## 96                     Palestine     -0.1641084
## 97                        Sweden     -0.1758480
## 98                         Ghana     -0.1852620
## 99                      Malaysia     -0.2029838
## 100                      Austria     -0.2376886
## 101                     Paraguay     -0.2417583
## 102                  Switzerland     -0.2646090
## 103                         Peru     -0.2654573
## 104                     Tanzania     -0.3044974
## 105                      Albania     -0.3113240
## 106                       Norway     -0.3131770
## 107                     Ethiopia     -0.3196687
## 108                       Brazil     -0.3672037
## 109                       Canada     -0.3900444
## 110                     Thailand     -0.4090353
## 111                      Bolivia     -0.4185947
## 112                        Libya     -0.4241940
## 113                        Chile     -0.4273287
## 114                      Ecuador     -0.4274162
## 115                      Algeria     -0.4822957
## 116                    Argentina     -0.5013870
## 117                   Mauritania     -0.5206040
## 118                       Malawi     -0.5288697
## 119                       Panama     -0.5507356
## 120                       Turkey     -0.5648763
## 121                     Colombia     -0.5942800
## 122                     Pakistan     -0.6157650
## 123         United Arab Emirates     -0.6417670
## 124                   Costa Rica     -0.6902500
## 125                      Nigeria     -0.9409543
## 126                        India     -0.9430467
## 127                       Jordan     -0.9798960
## 128                       Mexico     -1.1925850
## 129                       Zambia     -1.2535750
## 130                     Botswana     -1.3648390
## 131                  Afghanistan     -1.3791375
## 132                      Lebanon     -1.6172670
## 133                     Zimbabwe     -1.9600005
## 134                    Venezuela     -2.1410774
## 135                       Angola             NA
## 136 Democratic Republic of Congo             NA
## 137                        Haiti             NA
## 138                        Qatar             NA
## 139            Somaliland region             NA
## 140                        Sudan             NA
## 141                     Suriname             NA
## 142                        Syria             NA

5

1.5 Visual manipulation

(a) Construct three bad or inappropriate visual representations of the data. Inappropriate in the meaning as discussed in the lecture (insufficient informationon data, poor quality, etc.).

ggplot(diamonds, aes(cut, carat), color=factor(price)) +
  geom_point() 

ggplot(diamonds, aes(cut, carat), color=factor(price)) +
  geom_line() +
  facet_wrap(~depth)
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

(b) You can use any data prepared for this exercise. You can also use a new dataset. In addition, you can download annual data on causes of death in a country of your interest from Eurostat. Information on data: standardized death rate [HLTH CD ASDR], total population regarding sex and age, from 1994 to 2010.

death <- read.csv("https://raw.githubusercontent.com/MarkusStefan/Data_Analytics/main/Exercise1/Death_Causes_Eurostat.csv")
pie(x=death[,1], main="EU27 deaths")
legend("topright", inset = 0.05, title = "Legend", legend = death$TIME, fill=rainbow(factor(death$TIME)))

for (c in 2:ncol(death)){
  if (c==2){
    pool <- data.frame(death$TIME, death[, c], rep(colnames(death)[c], 10))
  }
  else {
    pool <- pool %>% rbind(data.frame(death$TIME, death[, c], rep(colnames(death)[c], 10)))
  }
}
pool <- pool %>% clean_names()
ggplot(pool, aes(x=pool$death_time) )+
  geom_point(aes(y=pool$death_c)) +
  facet_wrap(~pool$rep_colnames_death_c_10)
## Warning: Use of `pool$death_c` is discouraged. Use `death_c` instead.
## Warning: Use of `pool$death_time` is discouraged. Use `death_time` instead.
## Warning: Removed 325 rows containing missing values (geom_point).

barplot(height = pool$death_c, width=0.001)

ggplot(pool, aes(x=pool$death_time, y=pool$death_c)) +
  geom_point(col=2) +
  stat_smooth(method='lm')
## Warning: Use of `pool$death_time` is discouraged. Use `death_time` instead.
## Warning: Use of `pool$death_c` is discouraged. Use `death_c` instead.
## Warning: Use of `pool$death_time` is discouraged. Use `death_time` instead.
## Warning: Use of `pool$death_c` is discouraged. Use `death_c` instead.
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 325 rows containing non-finite values (stat_smooth).
## Warning: Removed 325 rows containing missing values (geom_point).